#!/usr/bin/env bash
# db-assign_sequential_id -- Assigns a unique integer to every row for a table using PostgreSQL sequence generator
# > eval "$(db-parse "$url")"
# > db-assign_sequential_id TABLE ID_COLUMN [PART_ID_COLUMN]
##
set -euo pipefail

[[ $# -gt 0 ]] || usage "$0" "Missing TABLE_CSV"
[[ $# -gt 1 ]] || usage "$0" "Missing ID_COLUMN"

TABLE_CSV=$1 IdColumn=$2 PartColumn=${3:-0}

# http://gpdb.docs.pivotal.io/4390/ref_guide/extensions/pl_python.html
db-supports_pg_lang "plpythonu"

SQL="
    CREATE OR REPLACE FUNCTION seqassign(partid INT, reset BOOL, pids INT[], offsets BIGINT[])
    RETURNS BIGINT AS \$\$
      if reset:
        for i, pid in enumerate(pids):
          SD[pid] = offsets[i]
        return 0

      x = SD[partid]
      SD[partid] += 1
      return x + (partid << 48)
    \$\$ LANGUAGE plpythonu;

    CREATE TEMP TABLE idstats(pid INT, tname TEXT, gpsid INT, total BIGINT)
      DISTRIBUTED BY (pid);
"


for table in $(echo $TABLE_CSV | sed "s/,/ /g")
do
    SQL="$SQL
    INSERT INTO idstats
      SELECT $PartColumn AS pid, '$table', gp_segment_id, COUNT(1)
      FROM $table
      GROUP BY pid, gp_segment_id;"
done

# http://stackoverflow.com/questions/22841206/calculating-cumulative-sum-in-postgresql
# Use larger table idstats_distributor to ensure stats are processed by all segments.
# Assuming 1000 rows cover all segments.
# Update (Jan3): 1000 rows sometimes fail to cover all segments after join, increased to 10000
# Update (Jan6): increased to 1M as ~100shards still cause failure
# (see explain analyze)
SQL="$SQL
CREATE TEMP TABLE idstats_rollup AS
  SELECT pid, gpsid, total, SUM(total) OVER (PARTITION BY pid ORDER BY gpsid) AS cum_total
  FROM (
    SELECT pid, gpsid, SUM(total) as total
    FROM idstats
    GROUP BY pid, gpsid
    ORDER BY pid, gpsid
  ) t
  DISTRIBUTED BY (pid);

CREATE TEMP TABLE idstats_bases AS
  SELECT
    gpsid,
    ARRAY_AGG(pid ORDER BY pid) as pids,
    ARRAY_AGG((cum_total - total)::BIGINT ORDER BY pid) as offsets
  FROM idstats_rollup
  GROUP BY gpsid
  DISTRIBUTED RANDOMLY;

CREATE TEMP TABLE idstats_distributor as
  SELECT generate_series(1, 100000) id
  DISTRIBUTED BY (id);

ANALYZE idstats_bases;
ANALYZE idstats_distributor;

-- Have GP lay down data in a safe way for seqassign()
SET enable_hashjoin=off;
SET enable_mergejoin=off;
SET enable_nestloop=off;

SELECT seqassign(gpsid, true, pids, offsets)
  FROM idstats_bases a, idstats_distributor b
  WHERE a.gpsid = b.gp_segment_id;

"

for table in $(echo $TABLE_CSV | sed "s/,/ /g")
do
    SQL="$SQL
    UPDATE $table SET $IdColumn = seqassign($PartColumn, false, NULL::INT[], NULL::BIGINT[]);"
done


db-execute "$SQL"
